---
title             : "Anchoring in the past, tweeting from the present: Cognitive bias in journalists’ word choices"
author: 
  - name          : "Jihye Lee"
    affiliation   : "1"
    corresponding : yes   
    email         : "jihyelee@stanford.edu"
  - name          : "James T. Hamilton"
    affiliation   : "1"

affiliation:
  - id            : "1"
    institution   : "Department of Communication, Stanford University"
output: html_document
---
This script generates main analyses of the project entitled "Anchoring in the past, tweeting from the present:Cognitive bias in journalists’ word choices".

```{r, load_packages, include = FALSE}
library(dplyr)
library(stringr)
library(QuantPsyc)
library(stargazer)
library(rstatix)
library(ggplot2)
library(tidyverse)
library(ggpubr)
library(scales)
library(tidytext)
library(stringr)
library(tidyr)
library(ggsignif)
library(EnvStats)
options(scipen=99)
```

#Data setup 
```{r}
#Read data sets
liwc <- read.csv("~/set_a_directory_to_import_the_data/PUBLIC_LIWC_clean_data.csv") 
data.anchoring <- read.csv("~/set_a_directory_to_import_the_data/PUBLIC_Anchoring_clean_data.csv")
```

#Table 3. Differences in language use across media
```{r}
#A series of t-test to test differences between Twitter and Papers
liwc_paper_tw <- filter(liwc, media == "Papers" | media == "Twitter")
liwc_paper_tw[] <- lapply(liwc_paper_tw, function(x) if(is.factor(x)) factor(x) else x)

t.test(Analytic ~ media, data = liwc_paper_tw)
liwc_paper_tw %>% cohens_d(Analytic~ media)

t.test(Authentic~ media, data = liwc_paper_tw)
liwc_paper_tw %>% cohens_d(Authentic~ media)

t.test(posemo~ media, data = liwc_paper_tw)
liwc_paper_tw %>% cohens_d(posemo~ media)

t.test(negemo~ media, data = liwc_paper_tw)
liwc_paper_tw %>% cohens_d(negemo~ media)

t.test(certain~ media, data = liwc_paper_tw)
liwc_paper_tw %>% cohens_d(certain~ media)

t.test(focuspresent~ media, data = liwc_paper_tw)
liwc_paper_tw %>% cohens_d(focuspresent~ media)

t.test(filler~ media, data = liwc_paper_tw)
liwc_paper_tw %>% cohens_d(filler~ media)

t.test(netspeak~ media, data = liwc_paper_tw)
liwc_paper_tw %>% cohens_d(netspeak~ media)

t.test(quant~ media, data = liwc_paper_tw)
liwc_paper_tw %>% cohens_d(quant~ media)

liwc_paper_tw %>% group_by(media) %>% summarize(reports = n(),
                                                word_count = sum(WC),
                                                mean(Analytic,na.rm=TRUE), sd(Analytic,na.rm=TRUE),
                                                mean(Authentic,na.rm=TRUE), sd(Authentic,na.rm=TRUE),
                                                mean(posemo,na.rm=TRUE), sd(posemo,na.rm=TRUE),
                                                mean(negemo,na.rm=TRUE), sd(negemo,na.rm=TRUE),
                                                mean(certain,na.rm=TRUE), sd(certain,na.rm=TRUE),
                                                mean(focuspresent,na.rm=TRUE), sd(focuspresent,na.rm=TRUE),
                                                mean(filler, na.rm=TRUE), sd(filler, na.rm=TRUE),
                                                mean(netspeak, na.rm=TRUE), sd(netspeak, na.rm=TRUE),
                                                mean(quant,na.rm=TRUE), sd(quant,na.rm=TRUE))

#A series of t-test to test differences between Twitter and broadcasts
liwc_tv_tw <-liwc[liwc$media %in% c('Broadcasts', 'Twitter'),]
liwc_tv_tw[] <- lapply(liwc_tv_tw, function(x) if(is.factor(x)) factor(x) else x)

t.test(Analytic ~ media, data = liwc_tv_tw)
liwc_tv_tw %>% cohens_d(Analytic~ media)

t.test(Authentic~ media, data = liwc_tv_tw)
liwc_tv_tw %>% cohens_d(Authentic~ media)

t.test(posemo~ media, data = liwc_tv_tw)
liwc_tv_tw %>% cohens_d(posemo~ media)

t.test(negemo~ media, data = liwc_tv_tw)
liwc_tv_tw %>% cohens_d(negemo~ media)

t.test(certain~ media, data = liwc_tv_tw)
liwc_tv_tw %>% cohens_d(certain~ media)

t.test(focuspresent~ media, data = liwc_tv_tw)
liwc_tv_tw %>% cohens_d(focuspresent~ media)

t.test(filler~ media, data = liwc_tv_tw)
liwc_tv_tw %>% cohens_d(filler~ media)

t.test(netspeak~ media, data = liwc_tv_tw)
liwc_tv_tw %>% cohens_d(netspeak~ media)

t.test(quant~ media, data = liwc_tv_tw)
liwc_tv_tw %>% cohens_d(quant~ media)

liwc_tv_tw %>% group_by(media) %>% summarize(reports = n(),
                                                word_count = sum(WC),
                                                mean(Analytic,na.rm=TRUE), sd(Analytic,na.rm=TRUE),
                                                mean(Authentic,na.rm=TRUE), sd(Authentic,na.rm=TRUE),
                                                mean(posemo,na.rm=TRUE), sd(posemo,na.rm=TRUE),
                                                mean(negemo,na.rm=TRUE), sd(negemo,na.rm=TRUE),
                                                mean(certain,na.rm=TRUE), sd(certain,na.rm=TRUE),
                                                mean(focuspresent,na.rm=TRUE), sd(focuspresent,na.rm=TRUE),
                                                mean(filler, na.rm=TRUE), sd(filler, na.rm=TRUE),
                                                mean(netspeak, na.rm=TRUE), sd(netspeak, na.rm=TRUE),
                                                mean(quant,na.rm=TRUE), sd(quant,na.rm=TRUE))

```

#Fig 2. Anchoring on prior political elections
```{R}
#Test of equal variance
fligner.test(primary_rate1000 ~ experience, data = data.anchoring )
fligner.test(pres_2012_rate1000 ~ experience, data = data.anchoring ) 
fligner.test(pres_5year_rate1000 ~ experience, data = data.anchoring ) 

#Test mean differences and calculate Cohen's d.
data.anchoring %>%
     t.test(primary_rate1000 ~ experience, data = ., var.equal=TRUE)
data.anchoring %>% cohens_d(primary_rate1000 ~ experience, var.equal = TRUE)

data.anchoring %>%
     t.test(pres_2012_rate1000 ~ experience, data = ., var.equal = FALSE)
data.anchoring %>% cohens_d(pres_2012_rate1000 ~ experience, var.equal = FALSE)

data.anchoring %>%
     t.test(pres_5year_rate1000 ~ experience, data = . , var.equal = FALSE)
data.anchoring %>% cohens_d(pres_5year_rate1000 ~ experience, var.equal = FALSE)

#Generate figures (Fig 2)
#Anchoring to the 2012 election
mean_2012 <- data.anchoring %>% group_by(experience) %>% 
                        summarise(mean_2012 = mean(pres_2012_rate1000,na.rm=TRUE),
                                  sd_2012=sd(pres_2012_rate1000,na.rm=TRUE)) %>% mutate(
                                                                                        experience = case_when(
                                                                                          experience == "0" ~ "No Experience",
                                                                                          experience == "1" ~ "Experience"
                                                                                        )) 


anchoring_2012 <- mean_2012 %>%
                      ggplot(aes(x=experience, y=mean_2012))+
                        geom_col(aes(fill = experience), color = "black", width = 0.6) +
                        geom_errorbar(aes(ymin = mean_2012 - sd_2012,
                                          ymax = mean_2012 + sd_2012),
                                      color = "#22292F",
                                      width = .1, size=1) + 
                      geom_signif(
                        annotations = c("p = .012"),
                        y_position = 8.1, xmin = 1, xmax = 2,
                        textsize = 10,
                        size = 1,
                        fontface = 'italic'
                      ) + 
                      scale_fill_manual(values = c("#E69F00", "grey40")) +
                      theme_classic() + 
                      labs(title = "2012 Presidential Election") +
                      ylab("Average references per 1,000 words") +
                      xlab("") +
                       theme(plot.title = element_text(size=32, hjust = 0.5), 
                             axis.text=element_text(size=28), 
                             axis.title=element_text(size=28),  
                             legend.text=element_text(size=28),
                             strip.text.x = element_text(size=28),
                             legend.position = "none",
                             axis.title.x = element_blank(),
                             panel.border = element_rect(colour = "black", fill=NA, size=1))

#Anchoring to the past five presidential elections
mean_pres_yr <- data.anchoring %>% group_by(experience) %>% 
                        summarise(mean_pres = mean(pres_5year_rate1000,na.rm=TRUE),
                                  sd_pres = sd(pres_5year_rate1000,na.rm=TRUE)) %>% mutate(
                                                                                        experience = case_when(
                                                                                          experience == "0" ~ "No Experience",
                                                                                          experience == "1" ~ "Experience"
                                                                                        )) 

anchoring_pres_yr <-mean_pres_yr %>%
                      ggplot(aes(x=experience, y=mean_pres))+
                        geom_col(aes(fill = experience), color = "black", width = 0.6) +
                        geom_errorbar(aes(ymin = mean_pres - sd_pres,
                                          ymax = mean_pres + sd_pres),
                                      color = "#22292F",
                                      width = .1, size=1) + 
                      geom_signif(
                        annotations = "p = .002",
                        y_position = 10, xmin = 1, xmax = 2,
                        textsize = 10,
                        size = 1,
                        fontface = 'italic'
                      ) + 
                      scale_fill_grey(start = 0.5) +
                      scale_y_continuous(breaks = seq(0, 10.5, by = 2)) +
                      theme_classic() + 
                      labs(title = "Past Five Presidential Elections") +
                      ylab("Average references per 1,000 words") +
                      xlab("") +
                      scale_fill_manual(values = c("#E69F00", "grey40")) +
                       theme(plot.title = element_text(size=32, hjust = 0.5), 
                             axis.text=element_text(size=28), 
                             axis.title=element_text(size=28),  
                             legend.text=element_text(size=28),
                             strip.text.x = element_text(size=28),
                             legend.position = "none",
                             axis.title.x = element_blank(),
                             panel.border = element_rect(colour = "black", fill=NA, size=1))

#Anchoring to the 2016 primaries
mean_2016 <- data.anchoring %>% group_by(experience) %>% 
                        summarise(mean_2016 = mean(primary_rate1000,na.rm=TRUE),
                                  sd_2016 = sd(primary_rate1000,na.rm=TRUE)) %>% mutate(
                                                                                        experience = case_when(
                                                                                          experience == "0" ~ "No Experience",
                                                                                          experience == "1" ~ "Experience"
                                                                                        )) 
anchoring_2016 <-mean_2016 %>%
                      ggplot(aes(x=experience, y=mean_2016))+
                        geom_col(aes(fill = experience), color = "black", width = 0.6) +
                        geom_errorbar(aes(ymin = mean_2016 - sd_2016,
                                          ymax = mean_2016 + sd_2016),
                                      color = "#22292F",
                                      width = .1, size=1) + 
                      geom_signif(
                        annotations = c("p = .31"),
                        y_position = 82, xmin = 1, xmax = 2,
                        textsize = 10,
                        size=1,
                        fontface = 'italic'
                      ) + 
                      scale_fill_manual(values = c("#E69F00", "grey40")) +
                      theme_classic() + 
                      labs(title = "2016 Primaries") +
                      ylab("Average references per 1,000 words") +
                      xlab("") +
                       theme(plot.title = element_text(size=32, hjust = 0.5), 
                             axis.text=element_text(size=28), 
                             axis.title=element_text(size=28),  
                             legend.text=element_text(size=28),
                             strip.text.x = element_text(size=28),
                             legend.position = "none",
                             axis.title.x = element_blank(),
                             panel.border = element_rect(colour = "black", fill=NA, size=1))
```

#Table 5. Impact of a journalist’s experience of presidential campaign coverage on reporting the 2016 presidential election
```{r}
DF <- data.anchoring %>% filter(type != "twitter")
DF <- DF[-c(2, 7, 10, 18, 28, 30, 32, 36, 39, 41, 43, 46, 50, 52, 54, 57, 59, 61, 68, 81, 86, 88, 92, 95), ] #Drop observations from non-major outlets from each journalist. 

DF <- DF %>% mutate(media_recoded = case_when((media == "paper") ~ "papers",
                                              (media == "magazine") ~ "papers",
                                              (media == "radio") ~ "broadcasts",
                                              (media == "broadcaster") ~ "broadcasts"))
DF$media_recoded <- as.factor(DF$media_recoded)
DF$female <- as.factor(DF$female)

nb_2012 <- glm.nb(pres_2012_count ~ experience + factor(media_recoded) + age + female + offset(log(1000*total_words_sum)), data = DF)
nb_5years <- glm.nb(pres_5year_count~ experience + factor(media_recoded) + age + female + offset(log(1000*total_words_sum)), data = DF)
stargazer(nb_2012,nb_5years, type= 'text')

est_2012 <- cbind(Estimate = coef(nb_2012), confint(nb_2012))
exp(est_2012)
est_5year <- cbind(Estimate = coef(nb_5years), confint(nb_5years))
exp(est_5year)
```
